In [84]:
# KNN, or K-Nearest Neighbor Algorithm
# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# Load data
os.chdir("C:\\Users\\baron\\Documents\\Teach\\627 Statistical Machine Learning\\Data")  # Change the working directory
Auto = pd.read_csv("Auto.csv")  # Read the data file in the CSV format
Auto['horsepower'] = pd.to_numeric(Auto['horsepower'], errors='coerce')  # Non-numeric entries (e.g., '?') become NaN
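In [ ]:
# A quick check (a sketch, not in the original run): count how many
# horsepower entries became NaN after coercion. The summary below shows
# 392 non-missing horsepower values out of 397 rows, i.e., 5 NaNs.
print(Auto['horsepower'].isna().sum())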
In [85]:
# Summary of all numeric variables
print(Auto.describe())
              mpg   cylinders  displacement  horsepower       weight  \
count  397.000000  397.000000    397.000000  392.000000   397.000000   
mean    23.515869    5.458438    193.532746  104.469388  2970.261965   
std      7.825804    1.701577    104.379583   38.491160   847.904119   
min      9.000000    3.000000     68.000000   46.000000  1613.000000   
25%     17.500000    4.000000    104.000000   75.000000  2223.000000   
50%     23.000000    4.000000    146.000000   93.500000  2800.000000   
75%     29.000000    8.000000    262.000000  126.000000  3609.000000   
max     46.600000    8.000000    455.000000  230.000000  5140.000000   

       acceleration        year      origin  
count    397.000000  397.000000  397.000000  
mean      15.555668   75.994962    1.574307  
std        2.749995    3.690005    0.802549  
min        8.000000   70.000000    1.000000  
25%       13.800000   73.000000    1.000000  
50%       15.500000   76.000000    1.000000  
75%       17.100000   79.000000    2.000000  
max       24.800000   82.000000    3.000000  
In [86]:
# Some horsepower values are NaN; drop those cars.
Auto.dropna(inplace=True)
In [87]:
# Create a fuel consumption rating variable.
# 'Economy', the fuel consumption rating, is defined as a
# categorical variable based on miles per gallon.

Auto['Economy'] = pd.cut(Auto['mpg'],
                              bins=[-np.inf, 17, 23, 29, np.inf],
                              labels=['Heavy', 'OK', 'Eco', 'Excellent'])

print(Auto['Economy'].value_counts())        # Group counts
Economy
OK           106
Heavy         99
Excellent     95
Eco           92
Name: count, dtype: int64
In [ ]:
# We used the sample quartiles of mpg to define these ratings, which is why
# we obtained four approximately equal groups.
# Now, we'll derive a classification rule using other car characteristics.
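In [ ]:
# A quick check (a sketch): the bin edges 17, 23, and 29 used above should
# sit near the sample quartiles of mpg.
print(Auto['mpg'].quantile([0.25, 0.50, 0.75]))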
In [136]:
# Prepare training and testing data, predictors (X) and responses (Y)

X = Auto.iloc[:, 1:7]  # columns from cylinders to year
Y = Auto['Economy']

# KNN needs four inputs: training X, training Y, testing X, and K.
# We'll use Y_test for performance evaluation and tuning.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.5, random_state=42)   # Split data at random
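In [ ]:
# Optional refinement (a sketch, not part of the original analysis): KNN is
# distance-based, so predictors on large scales (e.g., weight) dominate the
# Euclidean distance. Standardizing the predictors often improves KNN;
# the means and standard deviations are learned on the training set only.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)     # Fit on training data only
X_train_sc = scaler.transform(X_train)     # Apply the same transform to both sets
X_test_sc = scaler.transform(X_test)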
In [94]:
# KNN
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, Y_train)
knn_result = knn.predict(X_test)
In [96]:
# Confusion matrix and accuracy
conf_matrix = confusion_matrix(Y_test, knn_result)
print(conf_matrix)
[[23 26  0  2]
 [ 5 31  0  4]
 [ 0  0 39  9]
 [23  5  4 25]]
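In [ ]:
# The rows and columns of conf_matrix follow knn.classes_, which sklearn
# sorts alphabetically. A sketch to label them for easier reading:
print(pd.DataFrame(conf_matrix, index=knn.classes_, columns=knn.classes_))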
In [98]:
accuracy = accuracy_score(Y_test, knn_result)
print(f'Accuracy: {accuracy}')
Accuracy: 0.6020408163265306
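In [ ]:
# A sketch (not part of the original notebook): per-class precision and
# recall show which Economy ratings the classifier confuses most.
from sklearn.metrics import classification_report
print(classification_report(Y_test, knn_result))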
In [138]:
# 60.2% correct classification rate with K=3. Is there a better K? Check the accuracy for K from 1 to 19.

K = list(range(1,20))
class_rate = []

for k in K:                                        
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, Y_train)
    knn_result = knn.predict(X_test)
    accuracy = accuracy_score(Y_test, knn_result)
    class_rate.append(accuracy)

plt.figure()
plt.plot(K, class_rate)
plt.xlabel('K')
plt.ylabel('Classification rate')
plt.title('Tuning of the KNN algorithm')
plt.show()
[Figure: classification rate vs. K, "Tuning of the KNN algorithm"]
In [140]:
tuning_result = pd.DataFrame({'K':K, 'class_rate':class_rate})
In [142]:
print(tuning_result)
     K  class_rate
0    1    0.612245
1    2    0.607143
2    3    0.602041
3    4    0.622449
4    5    0.596939
5    6    0.596939
6    7    0.596939
7    8    0.596939
8    9    0.591837
9   10    0.607143
10  11    0.581633
11  12    0.591837
12  13    0.586735
13  14    0.586735
14  15    0.586735
15  16    0.586735
16  17    0.602041
17  18    0.612245
18  19    0.607143
In [ ]:
# K=4 provides the best classification rate in this range, 62.2%.
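In [ ]:
# A programmatic way to pick the best K from the tuning results (a sketch):
best_k = K[int(np.argmax(class_rate))]
print(f'Best K = {best_k}, classification rate = {max(class_rate):.4f}')

# Note: these rates come from a single random split, so they are noisy.
# Cross-validation (e.g., sklearn.model_selection.cross_val_score) averages
# over several splits and gives a more stable choice of K.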